###
### Measure tune-out at individual level
###

library(tidyverse)
library(parallel)
library(lubridate)
library(hms)
library(broom)
library(ggthemes)
library(data.table)
library(magrittr)
library(fixest)

### SET WORKING DIRECTORY HERE ###
# Every data path used below is relative to this directory.
path_to_archive <- "replication/"
setwd(path_to_archive)


# for timezone adjustments
# load() is expected to restore an object named dma_timezone into the
# workspace; it is reshaped immediately below.
load("data/dma_timezone.RData")

# Keep only the join key and the timezone, translating the ETZ / CTZ / PTZ
# abbreviations into Olson names. recode() leaves any other value unchanged.
# Rows without a dma_code cannot be joined on and are dropped.
dma_timezone <- dma_timezone %>%
	select(dma_code, timezone) %>%
	mutate(timezone = recode(timezone, ETZ = "America/New_York", CTZ = "America/Chicago", PTZ = "America/Los_Angeles")) %>%
	filter(!is.na(dma_code))

# read_raw() joins against this table using data.table syntax
dma_timezone <- as.data.table(dma_timezone)

# TRUE where `channel` equals `ad_channel` itself or `ad_channel` with a
# "DT" suffix appended. Vectorised elementwise over both arguments.
calls_match <- function(ad_channel, channel) {
	suffixed <- paste0(ad_channel, "DT")
	is_plain <- channel == ad_channel
	is_suffixed <- channel == suffixed
	is_plain | is_suffixed
}
# Path of the gzipped raw event file for `date`:
# data/FWM/raw/rawxml/FWM_<year><2-digit month><2-digit day>_R.pd.gz
rawfile_name <- function(date) {
	# left-pad a month / day number with "0" to two characters
	two_digits <- function(x) str_pad(x, pad = "0", width = 2)
	stamp <- paste0(year(date), two_digits(month(date)), two_digits(day(date)))
	paste0("data/FWM/raw/rawxml/FWM_", stamp, "_R.pd.gz")
}
# Path of the device reference file for `date`; the date is embedded using
# its as.character() representation.
reffile_name <- function(date) {
	day_label <- as.character(date)
	paste0("data/FWM/ref_data/fwm_ref_data_", day_label, ".rds")
}


# Read one day's raw tuning events and turn them into per-device viewing
# intervals.
#
# d: the day to load. It is passed to rawfile_name() / reffile_name() to
#    locate the raw event file and the device reference file, and it is used
#    as the event date of the synthetic end-of-day events.
#
# Returns a data.table holding only event_type == "T" rows. Each row carries
# event_time_utc (interval start), event_time_utc_end (start of the same
# device's next event) and dur (interval length in seconds, capped at 18000).
#
# Reads the global data.table dma_timezone (dma_code -> timezone).
read_raw <- function(d) {
	cat("\tReading raw data...\n")	
	# Pipe-delimited file, every column read as character; event_name and
	# event_value are then renamed to channel and channel_num.
	raw_view <- d %>% rawfile_name %>%
		fread( 
         sep="|", 
         col.names = c("mso", "device_id", "event_date", "event_time", "event_type", "event_value", "event_name", "event_id"),
         colClasses="character"
        ) %>%
		setnames(old=c("event_name", "event_value"), new=c("channel", "channel_num"))

	cat("\tConverting times and dates...\n")
	raw_view[,event_date := ymd(event_date)]
	# event_time is split by character position: 1-2 hours, 3-4 minutes, 5-6 seconds.
	# NOTE(review): the hours=/minutes=/seconds= arguments match hms::hms(), which
	# masks lubridate::hms() given the library() order at the top of the file — confirm.
	    raw_view[,event_time := hms(hours=as.numeric(substr(event_time,1,2)), minutes=as.numeric(substr(event_time, 3,4)), seconds=as.numeric(substr(event_time, 5,6)))]

	# channel number "65532" is re-typed as an "O" event (presumably "off" — confirm);
	# every event that is not a "T" event then gets the channel label "OFF".
	raw_view[channel_num == "65532", event_type := "O"]
	raw_view[event_type != "T", channel := "OFF"]

	# Device reference data for the day, reduced to three columns and joined to
	# dma_timezone on dma_code.
	# NOTE(review): in data.table's X[Y, on=] form the result has the rows of Y
	# (here dma_timezone), so reference rows whose dma_code is absent from
	# dma_timezone do not appear in ref_day — confirm this is intended.
	ref_day <- reffile_name(d) %>% readRDS %>% 
		as.data.table %>%
		.[,c("device_id","dma_code","zipcode")] %>%
		.[dma_timezone, on="dma_code"]

	# Attach device attributes to every raw event; events whose device_id has no
	# match in ref_day get NA attributes, which are filled in just below.
	raw_view <- ref_day[raw_view,on="device_id"]
	
	# replace missing dmas with LEXINGTON KY (541)
	raw_view[is.na(dma_code), dma_code:="541"]
	raw_view[is.na(timezone), timezone:="America/New_York"]

	# Interpret the local date + time in the device's timezone and convert to UTC,
	# one timezone at a time.
	raw_view[timezone == "America/New_York", event_time_utc := ymd_hms(paste(event_date, event_time, sep=" "), tz="America/New_York") %>% with_tz("UTC")]
	raw_view[timezone == "America/Chicago", event_time_utc := ymd_hms(paste(event_date, event_time, sep=" "), tz="America/Chicago") %>% with_tz("UTC")]
	raw_view[timezone == "America/Los_Angeles", event_time_utc := ymd_hms(paste(event_date, event_time, sep=" "), tz="America/Los_Angeles") %>% with_tz("UTC")]
	# Rows still without event_time_utc (timezone is none of the three names
	# above) are converted as if they were America/New_York.
	raw_view[is.na(event_time_utc), event_time_utc := ymd_hms(paste(event_date, event_time, sep=" "), tz="America/New_York") %>% with_tz("UTC")]


	cat("\tAppending end-of-day events...\n")
	# One 23:59:59 local-time marker on day d per timezone, with its UTC equivalent.
	end_of_day <- data.table(
		event_date = d,
		event_time = hms(hours=rep(23,3),minutes=rep(59,3),seconds=rep(59,3)),
		timezone = c("America/New_York", "America/Chicago", "America/Los_Angeles"))
	end_of_day[, event_time_utc := imap(timezone, ~ ymd_hms(paste(event_date[.y], event_time[.y], sep=" "), tz = .x) %>% with_tz("UTC")) %>% reduce(c) ]


	# One synthetic "E" row per device (taken from the first row seen for each
	# device_id), labelled "<device_id>_END" with channel "END" and channel_num 0.
	every_dev_end <- raw_view[!duplicated(device_id),.(device_id,dma_code,zipcode,timezone,mso,event_id)]
	every_dev_end[,event_type:="E"]
	every_dev_end[,event_id:=paste(device_id, "END", sep="_")]
	every_dev_end[,channel_num:=0]
	every_dev_end[,channel:="END"]

	# Give each synthetic row the end-of-day date/time for its timezone.
	every_dev_end <- end_of_day[every_dev_end, on="timezone"]

	# Append the synthetic rows and sort each device's events by UTC time.
	raw_view <- rbind(raw_view, every_dev_end)
	raw_view <- raw_view[order(device_id, event_time_utc),]

	cat("\tGenerating viewing intervals...\n")
	# An event ends when the same device's next event starts; rows with no
	# following event (NA end time) are dropped.
	raw_view[, event_time_utc_end := lead(event_time_utc), by = .(device_id)]
	raw_view <- raw_view[!is.na(event_time_utc_end)]

	## cap segment time at 5h (18000s), ~ 99th percentile
	# dur is only computed for "T" rows; capped rows also get their end time
	# pulled back to start + 18000s.
	raw_view[event_type=="T", dur := time_length(event_time_utc_end - event_time_utc, unit="second")]
	raw_view[dur > 18000, `:=` (dur = 18000, event_time_utc_end = event_time_utc + dseconds(18000)) ]

	# Only "T" rows are returned.
	raw_view[event_type=="T"]
}


# Per-device tune-out indicator for one ad airing.
#
# t_c:     data.table with (at least) device_id and group columns; only rows
#          with group == "T" are used.
# ad_time: start time of the ad, compared against event_time_utc.
# ad_dur:  ad length in seconds.
# tuning:  data.table of tuning events with device_id and event_time_utc.
#
# Returns a tibble with one row per device_id that has at least one row in
# `tuning` (devices without any are dropped by nomatch=0):
#   tuned_out  1 if any event_time_utc falls in (ad_time, ad_time + ad_dur], else 0
#   s_watched  seconds from ad_time to the earliest such event, or ad_dur if none
get_tune_out <- function(t_c, ad_time, ad_dur, tuning) {
	# tuning events of the devices in the T group
	treated <- t_c[group == "T"]
	tune <- tuning[treated, on = "device_id", nomatch = 0]

	# flag events that start during the ad; `tune` is a fresh join result, so
	# this by-reference update does not touch `tuning`
	tune[, tuned_out := as.numeric(event_time_utc > ad_time & event_time_utc <= ad_time + seconds(ad_dur))]
	setkey(tune, device_id, event_time_utc)

	# collapse to one row per device; inside the list, tuned_out still refers
	# to the per-event flag column
	per_device <- tune[
		,
		.(
			tuned_out = max(tuned_out),
			s_watched = as.numeric(min(ad_time + ad_dur, event_time_utc[tuned_out == 1]) - ad_time, units = "secs")
		),
		by = .(device_id)
	]

	as_tibble(per_device)
}

# day loop
# Compute individual-level tune-out for every ad on one day.
#
# date: the day to process; its as.character() form is used in file names.
#
# Reads data/dd/tc_groups_<date>.rds and the day's raw tuning data (via
# read_raw()), writes data/tuneout/indiv_tuneout_<date>.rds, and returns the
# same table that was written: the rows of the group file (minus the t_c
# list column) expanded by the per-device output of get_tune_out().
tuneout_day <- function(date) {
	day_label <- as.character(date)
	cat(day_label, "\n")

	# treatment / control group assignment (by device); keep only rows whose
	# t_c table contains at least one "T" device
	ad_groups <- readRDS(paste0("data/dd/tc_groups_", day_label, ".rds"))
	ad_groups <- filter(ad_groups, map_int(t_c, function(g) sum(g$group == "T")) > 0)

	# daily viewing data, restricted below to the union of devices that are in
	# a T group for any row
	cat("\tReading tuning data...\n")
	tune_day <- read_raw(date)

	treated_devices <- rbindlist(ad_groups$t_c)
	treated_devices <- treated_devices[group == "T"]
	treated_devices <- unique(treated_devices, by = "device_id")
	treated_devices <- treated_devices[, .(device_id)]

	tune_day <- tune_day[treated_devices, on = "device_id", nomatch = 0]

	# one get_tune_out() call per row: that row's T group against the day's
	# tuning events, with a fixed 30-second ad duration
	cat("\tLooping over ads...\n")
	with_tune_out <- mutate(
		ad_groups,
		tune_out = pmap(
			list(t_c = t_c, ad_time = event_time_utc),
			get_tune_out,
			tuning = tune_day,
			ad_dur = 30
		)
	)
	out <- unnest(select(with_tune_out, -t_c), cols = c(tune_out))

	saveRDS(out, paste0("data/tuneout/indiv_tuneout_", day_label, ".rds"))

	out
}

# Fixed RNG seed. NOTE(review): nothing in the code visible here draws random
# numbers — confirm whether the seed is still needed.
set.seed(-93623779)

# every calendar day from 2012-09-01 through 2012-11-06
dates <- seq(from = mdy("09/01/2012"), to = mdy("11/06/2012"), by = "days")

# process each day in turn, stack the per-day results row-wise and save them
tuneout_all <- map_dfr(dates, tuneout_day)
save(tuneout_all, file = "data/tuneout/indiv_tuneout_all.RData")